import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly as py
import hvplot.pandas
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
accuracy_score, confusion_matrix, classification_report,
roc_auc_score, roc_curve, auc,
plot_confusion_matrix, plot_roc_curve, mean_squared_error, r2_score, mean_absolute_error
)
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.float', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import feature_engine as f
# Load the listings into `train`. Judging by the file name this is presumably
# the label-encoded export; the commented-out line below switches to the
# alternative "pddummies2.csv" file instead.
train = pd.read_csv("labelencoder2.csv")
# train = pd.read_csv("pddummies2.csv")
# Quick sanity check: number of rows/columns and the first two rows.
print(train.shape)
train.head(2)
(177414, 16)
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22590 | 2010.00 | 2 | 4 | 2 | 71229.00 | 0 | 2 | 0 | 8 | 1 | 46110.00 | 2 | 11.00 | 0 | 0 |
| 1 | 35000 | 2019.00 | 0 | 3 | 2 | 43000.00 | 0 | 0 | 0 | 10 | 1 | 27355.00 | 2 | 2.00 | 0 | 1 |
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 177414 entries, 0 to 177413 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 177414 non-null int64 1 year 177414 non-null float64 2 condition 177414 non-null int64 3 cylinders 177414 non-null int64 4 fuel 177414 non-null int64 5 odometer 177414 non-null float64 6 title_status 177414 non-null int64 7 transmission 177414 non-null int64 8 drive 177414 non-null int64 9 type 177414 non-null int64 10 state 177414 non-null int64 11 MSRP 177414 non-null float64 12 quarter 177414 non-null int64 13 car_age 177414 non-null float64 14 is_vintage 177414 non-null int64 15 is_color_neutral 177414 non-null int64 dtypes: float64(4), int64(12) memory usage: 21.7 MB
# Columns that hold whole numbers: cast them all to int in one astype call.
int_columns = ['condition', 'year', 'odometer', 'MSRP', 'car_age']
train = train.astype({column: int for column in int_columns})
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 177414 entries, 0 to 177413 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 177414 non-null int64 1 year 177414 non-null int32 2 condition 177414 non-null int32 3 cylinders 177414 non-null int64 4 fuel 177414 non-null int64 5 odometer 177414 non-null int32 6 title_status 177414 non-null int64 7 transmission 177414 non-null int64 8 drive 177414 non-null int64 9 type 177414 non-null int64 10 state 177414 non-null int64 11 MSRP 177414 non-null int32 12 quarter 177414 non-null int64 13 car_age 177414 non-null int32 14 is_vintage 177414 non-null int64 15 is_color_neutral 177414 non-null int64 dtypes: int32(5), int64(11) memory usage: 18.3 MB
train.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 | 177414.00 |
| mean | 65849.87 | 2011.36 | 0.96 | 2.36 | 2.03 | 102761.39 | 0.14 | 0.10 | 0.63 | 5.57 | 23.73 | 33552.67 | 2.00 | 9.64 | 0.01 | 0.49 |
| std | 12994626.23 | 7.18 | 0.97 | 1.23 | 0.45 | 66073.92 | 0.76 | 0.36 | 0.70 | 4.30 | 14.98 | 14626.61 | 0.00 | 7.18 | 0.07 | 0.50 |
| min | 0.00 | 1900.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 2000.00 | 2.00 | -1.00 | 0.00 | 0.00 |
| 25% | 4500.00 | 2008.00 | 0.00 | 1.00 | 2.00 | 55853.50 | 0.00 | 0.00 | 0.00 | 0.00 | 9.00 | 24340.00 | 2.00 | 5.00 | 0.00 | 0.00 |
| 50% | 9995.00 | 2013.00 | 1.00 | 3.00 | 2.00 | 99195.50 | 0.00 | 0.00 | 1.00 | 8.00 | 23.00 | 30225.00 | 2.00 | 8.00 | 0.00 | 0.00 |
| 75% | 18998.00 | 2016.00 | 2.00 | 3.00 | 2.00 | 140659.00 | 0.00 | 0.00 | 1.00 | 9.00 | 37.00 | 41800.00 | 2.00 | 13.00 | 0.00 | 1.00 |
| max | 3736928711.00 | 2022.00 | 5.00 | 6.00 | 4.00 | 1365000.00 | 5.00 | 2.00 | 2.00 | 12.00 | 50.00 | 319995.00 | 2.00 | 121.00 | 1.00 | 1.00 |
Price has extreme values in data.\ Car_age has -1 value, which should not exist. \ Odometer has extreme values in data.
Odometer should top out at around 200,000 miles. \ Standard cars in this day and age are expected to keep running up to about 200,000 miles. \ But the summary statistics show odometer readings well above 200,000, up to 1,365,000. (The 3,736,928,711 maximum in the table belongs to price, not odometer.)
We will look at distribution for odometer.
# Set the figure size BEFORE plotting so that it also applies to this figure
# (calling sns.set after the plot only affects figures created later).
sns.set(rc={'figure.figsize':(5,5)})
# Distribution of the non-missing odometer readings.
sns.distplot(train['odometer'].dropna())
# Print the summary explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(train['odometer'].describe())
print(f"\n\nSkewness for odometer: {round(train['odometer'].skew(),2)}\n\n")
Skewness for odometer: 3.04
As expected, it is highly skewed to the right, so we need to handle the outliers.
# Using feature_engine, trim (i.e. drop) outlier rows based on IQR limits.
from feature_engine.outliers import OutlierTrimmer
# Set up the trimmer: only the right tail of 'odometer' is considered, and
# fold=1.5 places the limit at the 75th percentile + 1.5 * IQR.
# NOTE: the variable is named `capper`, but an OutlierTrimmer removes rows
# instead of capping values.
capper = OutlierTrimmer(capping_method='iqr', tail='right', fold=1.5, variables=['odometer'])
# Fit the trimmer so that it learns the cut-off from the data.
capper.fit(train)
OutlierTrimmer(capping_method='iqr', fold=1.5, variables=['odometer'])
IQR limits (fold = 1.5, right tail only):\ right tail: 75th quantile + 1.5 IQR\ left tail: not trimmed here (it would be 25th quantile - 1.5 IQR)
capper.right_tail_caps_
{'odometer': 267867.25}
# Apply the fitted trimmer: rows whose odometer exceeds the learned right-tail
# limit are removed, and the result is stored in train2.
train2= capper.transform(train)
# Largest odometer reading that survived the trimming.
train2['odometer'].max()
267842
Revisit the distribution after outlier trimming
# Configure the figure size first so it is in effect when the plot is drawn.
sns.set(rc={'figure.figsize':(5,5)})
# Distribution of the non-missing odometer readings after trimming.
sns.distplot(train2['odometer'].dropna())
# Print the summary explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(train2['odometer'].describe())
print(f"\n\nSkewness for odometer: {round(train2['odometer'].skew(),2)}\n\n")
Skewness for odometer: 0.3
Skewness is reduced to 0.3 and distribution looks approximately normal.
The distribution shows that some listings have an odometer value of 0. In a used-car market, an odometer of 0 is suspicious.
A new car can plausibly have an odometer of 0, so the case below (condition 5) makes sense.
train2[(train2['odometer']==0)&(train2['condition']==5)]
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 102845 | 3500 | 2016 | 5 | 1 | 4 | 0 | 0 | 2 | 0 | 7 | 33 | 24340 | 2 | 5 | 0 | 0 |
This case is more questionable, but still possible: a condition of 4 means "like new".
train2[(train2['odometer']==0)&(train2['condition']==4)]
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 158252 | 34999 | 2021 | 4 | 1 | 2 | 0 | 0 | 0 | 2 | 10 | 43 | 17990 | 2 | 0 | 0 | 1 |
| 168760 | 20998 | 2021 | 4 | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 47 | 22195 | 2 | 0 | 0 | 1 |
Drop rows that have an odometer of 0 and a condition below 4: such a car has been used/driven, so its odometer should be above 0.
# Keep a row unless it combines a zero odometer with a condition below 4.
has_mileage = train2['odometer'] != 0
not_below_like_new = ~(train2['condition'] < 4)
train3 = train2[has_mileage | not_below_like_new]
Look at summary stats again.
train3.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 | 175468.00 |
| mean | 66495.01 | 2011.44 | 0.96 | 2.36 | 2.03 | 100589.34 | 0.14 | 0.10 | 0.63 | 5.56 | 23.75 | 33549.30 | 2.00 | 9.56 | 0.00 | 0.49 |
| std | 13066483.61 | 7.11 | 0.97 | 1.23 | 0.45 | 56747.62 | 0.76 | 0.35 | 0.70 | 4.30 | 14.97 | 14622.17 | 0.00 | 7.11 | 0.07 | 0.50 |
| min | 0.00 | 1900.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 2000.00 | 2.00 | -1.00 | 0.00 | 0.00 |
| 25% | 4600.00 | 2008.00 | 0.00 | 1.00 | 2.00 | 55787.00 | 0.00 | 0.00 | 0.00 | 0.00 | 9.00 | 24340.00 | 2.00 | 5.00 | 0.00 | 0.00 |
| 50% | 9999.00 | 2013.00 | 1.00 | 3.00 | 2.00 | 98880.00 | 0.00 | 0.00 | 1.00 | 8.00 | 23.00 | 30225.00 | 2.00 | 8.00 | 0.00 | 0.00 |
| 75% | 19000.00 | 2016.00 | 2.00 | 3.00 | 2.00 | 140000.00 | 0.00 | 0.00 | 1.00 | 9.00 | 37.00 | 41800.00 | 2.00 | 13.00 | 0.00 | 1.00 |
| max | 3736928711.00 | 2022.00 | 5.00 | 6.00 | 4.00 | 267842.00 | 5.00 | 2.00 | 2.00 | 12.00 | 50.00 | 319995.00 | 2.00 | 121.00 | 1.00 | 1.00 |
The odometer column now has more reasonable statistics.
Price is also another variable that shows extreme values. \ Since we are looking at used car market, based on the business purpose, we need to find cars that might not be so suitable for our analysis.
# Configure the figure size first so it is in effect when the plot is drawn.
sns.set(rc={'figure.figsize':(5,5)})
# Distribution of the non-missing prices.
sns.distplot(train3['price'].dropna())
# Print the summary explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(train3['price'].describe())
# Fixed: the skewness is now computed on 'price'; it used to be taken from the
# 'odometer' column even though the message is labelled "price".
print(f"\n\nSkewness for price: {round(train3['price'].skew(),2)}\n\n")
Skewness for price: 0.3
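Note: the 0.3 shown above is the skewness of odometer, not of price (the statistic was taken from the odometer column). With prices as high as 3,736,928,711, the price column is in fact extremely right-skewed.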
We will drop cars priced below 1000 or above 200000. \ Cars below 1000 are very likely posting errors, especially for recent model years: the seller might have listed a low price just to get buyers' attention. \ Cars above 200000: a typical used-car buyer would not consider a car priced at 200000; that is the price of a new Porsche.
train3[train3['price']<1000]
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 0 | 2011 | 0 | 1 | 2 | 99615 | 0 | 0 | 0 | 0 | 1 | 21940 | 2 | 10 | 0 | 0 |
| 8 | 0 | 2018 | 3 | 3 | 2 | 68472 | 0 | 0 | 2 | 11 | 1 | 30745 | 2 | 3 | 0 | 1 |
| 9 | 0 | 2019 | 3 | 3 | 2 | 69125 | 0 | 0 | 2 | 11 | 1 | 30745 | 2 | 2 | 0 | 1 |
| 10 | 0 | 2018 | 3 | 3 | 2 | 66555 | 0 | 0 | 2 | 11 | 1 | 30745 | 2 | 3 | 0 | 1 |
| 19 | 0 | 2015 | 0 | 1 | 2 | 99505 | 0 | 0 | 1 | 9 | 1 | 20720 | 2 | 6 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 177378 | 0 | 2017 | 0 | 4 | 2 | 62000 | 0 | 0 | 0 | 10 | 50 | 33210 | 2 | 4 | 0 | 1 |
| 177380 | 0 | 2012 | 0 | 4 | 2 | 97000 | 0 | 0 | 0 | 10 | 50 | 33210 | 2 | 9 | 0 | 1 |
| 177382 | 800 | 2000 | 0 | 4 | 2 | 100000 | 0 | 0 | 2 | 3 | 50 | 49150 | 2 | 21 | 0 | 1 |
| 177400 | 0 | 2004 | 0 | 1 | 3 | 239000 | 0 | 0 | 1 | 0 | 50 | 25765 | 2 | 17 | 0 | 0 |
| 177413 | 0 | 2010 | 0 | 3 | 2 | 155000 | 0 | 0 | 0 | 0 | 50 | 31510 | 2 | 11 | 0 | 0 |
22758 rows × 16 columns
train3[train3['price']>200000]
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 746 | 99999999 | 1993 | 1 | 1 | 2 | 99999 | 0 | 0 | 0 | 0 | 1 | 17990 | 2 | 28 | 0 | 0 |
| 3857 | 229500 | 2018 | 0 | 4 | 2 | 7910 | 5 | 0 | 2 | 2 | 3 | 280900 | 2 | 3 | 0 | 0 |
| 5937 | 239995 | 2017 | 0 | 4 | 2 | 6450 | 0 | 0 | 2 | 3 | 3 | 252800 | 2 | 4 | 0 | 0 |
| 6632 | 239995 | 2017 | 0 | 4 | 2 | 6012 | 0 | 0 | 2 | 3 | 3 | 252800 | 2 | 4 | 0 | 0 |
| 17222 | 347999 | 2020 | 0 | 4 | 2 | 3000 | 0 | 0 | 2 | 3 | 4 | 276550 | 2 | 1 | 0 | 0 |
| 17522 | 349999 | 2020 | 0 | 4 | 2 | 2800 | 0 | 0 | 2 | 3 | 4 | 276550 | 2 | 1 | 0 | 0 |
| 34512 | 304995 | 2021 | 0 | 3 | 2 | 22 | 0 | 0 | 2 | 2 | 7 | 96200 | 2 | 0 | 0 | 1 |
| 35285 | 304995 | 2021 | 0 | 3 | 2 | 22 | 0 | 0 | 2 | 2 | 7 | 96200 | 2 | 0 | 0 | 1 |
| 39932 | 204999 | 2015 | 0 | 6 | 2 | 30920 | 0 | 2 | 2 | 3 | 9 | 319995 | 2 | 6 | 0 | 0 |
| 40900 | 204999 | 2015 | 0 | 6 | 2 | 30920 | 0 | 2 | 2 | 3 | 9 | 319995 | 2 | 6 | 0 | 0 |
| 41884 | 6995495 | 2014 | 1 | 1 | 2 | 135888 | 0 | 0 | 1 | 0 | 9 | 33295 | 2 | 7 | 0 | 0 |
| 46537 | 204999 | 2015 | 0 | 6 | 2 | 30920 | 0 | 2 | 2 | 3 | 9 | 319995 | 2 | 6 | 0 | 0 |
| 50990 | 225995 | 2016 | 0 | 3 | 2 | 159 | 0 | 2 | 2 | 3 | 11 | 96200 | 2 | 5 | 0 | 0 |
| 51303 | 289995 | 2016 | 0 | 6 | 2 | 3675 | 0 | 1 | 2 | 3 | 11 | 319995 | 2 | 5 | 0 | 1 |
| 51751 | 289900 | 2018 | 0 | 4 | 2 | 3300 | 0 | 0 | 2 | 2 | 11 | 280900 | 2 | 3 | 0 | 1 |
| 53791 | 425000 | 1993 | 0 | 3 | 2 | 380 | 0 | 0 | 0 | 0 | 13 | 23995 | 2 | 28 | 0 | 0 |
| 55532 | 123456789 | 1999 | 3 | 3 | 2 | 96000 | 0 | 0 | 1 | 9 | 13 | 28565 | 2 | 22 | 0 | 1 |
| 59120 | 227995 | 2017 | 0 | 4 | 2 | 13828 | 0 | 0 | 2 | 0 | 14 | 252800 | 2 | 4 | 0 | 0 |
| 60443 | 566567 | 2006 | 1 | 3 | 2 | 130000 | 0 | 0 | 0 | 9 | 15 | 37570 | 2 | 15 | 0 | 0 |
| 62301 | 1234567 | 2006 | 3 | 3 | 2 | 123456 | 0 | 0 | 0 | 0 | 15 | 23995 | 2 | 15 | 0 | 0 |
| 66268 | 215000 | 2015 | 1 | 3 | 2 | 157387 | 0 | 0 | 0 | 0 | 12 | 69480 | 2 | 6 | 0 | 0 |
| 75942 | 1410065407 | 1989 | 1 | 3 | 0 | 103000 | 0 | 1 | 0 | 0 | 20 | 23995 | 2 | 32 | 0 | 0 |
| 79549 | 123456789 | 2015 | 3 | 1 | 2 | 64181 | 0 | 0 | 1 | 9 | 22 | 16170 | 2 | 6 | 0 | 0 |
| 88649 | 1111111 | 1999 | 1 | 4 | 2 | 200000 | 0 | 0 | 0 | 3 | 23 | 68160 | 2 | 22 | 0 | 0 |
| 90872 | 1111111 | 1970 | 1 | 4 | 2 | 42000 | 0 | 0 | 2 | 3 | 25 | 31995 | 2 | 51 | 1 | 0 |
| 132514 | 229500 | 2018 | 0 | 4 | 2 | 7910 | 5 | 0 | 2 | 2 | 37 | 280900 | 2 | 3 | 0 | 0 |
| 133792 | 3736928711 | 2007 | 0 | 4 | 2 | 164000 | 0 | 0 | 0 | 8 | 37 | 33210 | 2 | 14 | 0 | 1 |
| 146666 | 229500 | 2018 | 0 | 4 | 2 | 7910 | 5 | 0 | 2 | 2 | 42 | 280900 | 2 | 3 | 0 | 0 |
| 148256 | 3736928711 | 1999 | 1 | 3 | 2 | 211000 | 0 | 0 | 0 | 0 | 42 | 41365 | 2 | 22 | 0 | 0 |
| 158250 | 25003000 | 1991 | 1 | 3 | 2 | 200000 | 0 | 0 | 2 | 3 | 43 | 29205 | 2 | 30 | 0 | 0 |
| 167266 | 304995 | 2021 | 0 | 3 | 2 | 22 | 0 | 0 | 2 | 2 | 45 | 96200 | 2 | 0 | 0 | 1 |
| 170998 | 225000 | 1969 | 0 | 4 | 2 | 33000 | 0 | 1 | 1 | 3 | 47 | 41800 | 2 | 52 | 1 | 1 |
| 171518 | 225000 | 1969 | 0 | 4 | 2 | 33000 | 0 | 1 | 1 | 3 | 47 | 41800 | 2 | 52 | 1 | 1 |
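Using the feature_engine ArbitraryOutlierCapper, cap the price column: values below 1000 are raised to 1000 and values above 200000 are lowered to 200000 (no rows are removed at this step).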
from feature_engine.outliers import ArbitraryOutlierCapper
# Set up the capper: price is clipped to the range [1000, 200000].
# NOTE(review): a capper replaces out-of-range values with the limits; it does
# not drop any rows. This also rebinds the name `capper`, which previously
# held the odometer OutlierTrimmer.
capper = ArbitraryOutlierCapper(max_capping_dict={'price': 200000}, min_capping_dict={'price':1000})
# Fit the capper (the limits are the user-supplied values above).
capper.fit(train3)
# train4 is train3 with the price column capped.
train4= capper.transform(train3)
Re-examine the distribution. Skewness appears to have increased, but the 0.3 reported earlier was computed on odometer rather than price, so the two numbers are not comparable; we will decide how to handle the skew after running the models. \ -> After running the models, we decided not to transform price to reduce skewness: a log transform made the model results worse.
# Price outliers: compute the 0.1% and 99.9% quantiles of the capped price.
# The next filtering step uses `low` and `high` as exclusive bounds.
lower = 0.001
higher = 0.999
low, high = train4['price'].quantile([lower, higher])
print(low, high)
1000.0 99997.1320000001
# Keep only rows strictly inside the (low, high) price bounds. Take a copy so
# that adding a column below does not write into a slice of the previous frame.
train4 = train4[(train4['price']>low) & (train4['price']<high)].copy()
# Depreciation = MSRP minus the listed price, computed on the filtered frame
# itself. It previously subtracted the price of the raw `train` frame and
# relied on index alignment to pick the matching rows.
train4['depreciation'] = train4['MSRP'] - train4['price']
train4.head(2)
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | quarter | car_age | is_vintage | is_color_neutral | depreciation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22590 | 2010 | 2 | 4 | 2 | 71229 | 0 | 2 | 0 | 8 | 1 | 46110 | 2 | 11 | 0 | 0 | 23520.00 |
| 1 | 35000 | 2019 | 0 | 3 | 2 | 43000 | 0 | 0 | 0 | 10 | 1 | 27355 | 2 | 2 | 0 | 1 | -7645.00 |
# Configure the figure size first so it is in effect when the plot is drawn.
sns.set(rc={'figure.figsize':(5,5)})
# Distribution of the non-missing prices after capping and quantile filtering.
sns.distplot(train4['price'].dropna())
# Print the summary explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(train4['price'].describe())
print(f"\n\nSkewness for price: {round(train4['price'].skew(),2)}\n\n")
Skewness for price: 1.69
We found that some rows have price > MSRP (i.e. negative depreciation). MSRP is the initial selling price for new cars, so a used car listed above it is abnormal. \ Decided to drop abnormal data that has price > MSRP.
# Configure the figure size first so it is in effect when the plot is drawn.
sns.set(rc={'figure.figsize':(5,5)})
# Distribution of the non-missing MSRP values.
sns.distplot(train4['MSRP'].dropna())
# Print the summary explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(train4['MSRP'].describe())
# Fixed: the skewness is now computed on 'MSRP'; it used to be taken from the
# 'price' column even though the message is labelled "MSRP".
print(f"\n\nSkewness for MSRP: {round(train4['MSRP'].skew(),2)}\n\n")
Skewness for MSRP: 1.69
# MSRP outliers: compute the 0.1% and 99.9% quantiles of MSRP.
# `low` and `high` are overwritten here and used by the MSRP filter that follows.
lower = 0.001
higher = 0.999
low, high = train4['MSRP'].quantile([lower, higher])
print(low, high)
2000.0 111510.0
# Keep only rows strictly inside the MSRP quantile bounds currently held in
# `low` and `high`.
train4 = train4[(train4['MSRP']>low) & (train4['MSRP']<high)]
# Depreciation outliers: compute the 10% and 90% quantiles of depreciation.
# NOTE(review): these cuts are far more aggressive than the 0.1% / 99.9%
# quantiles used for price and MSRP — confirm that 0.1 / 0.9 is intentional.
lower = 0.1
higher = 0.9
low, high = train4['depreciation'].quantile([lower, higher])
print(low, high)
738.5 36590.0
# Keep only rows whose depreciation lies strictly between the `low` and `high`
# quantile bounds.
train4 = train4[(train4['depreciation']>low) & (train4['depreciation']<high)]
# Configure the figure size first so it is in effect when the plot is drawn.
sns.set(rc={'figure.figsize':(5,5)})
# Distribution of the non-missing depreciation values after filtering.
sns.distplot(train4['depreciation'].dropna())
# Print the summary explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(train4['depreciation'].describe())
print(f"\n\nSkewness for depreciation: {round(train4['depreciation'].skew(),2)}\n\n")
Skewness for depreciation: 0.14
# The helper column was only needed for the outlier filter; remove it again.
train4 = train4.drop(columns='depreciation')
Initially we derived a quarter feature, but then realized that every row in the dataset has quarter = 2. That gives us no way to see any cyclical pattern, so we drop the column here.
# 'quarter' is constant across the dataset, so leave it out of train5.
train5 = train4.drop(columns='quarter')
# Summary statistics of the remaining columns.
train5.describe()
| price | year | condition | cylinders | fuel | odometer | title_status | transmission | drive | type | state | MSRP | car_age | is_vintage | is_color_neutral | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 | 120991.00 |
| mean | 13670.31 | 2011.33 | 0.99 | 2.15 | 2.04 | 106197.57 | 0.17 | 0.09 | 0.66 | 5.72 | 23.68 | 31328.85 | 9.67 | 0.00 | 0.50 |
| std | 9474.10 | 6.35 | 0.96 | 1.18 | 0.38 | 54160.41 | 0.83 | 0.35 | 0.67 | 4.28 | 14.88 | 9997.92 | 6.35 | 0.06 | 0.50 |
| min | 1050.00 | 1905.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 2130.00 | -1.00 | 0.00 | 0.00 |
| 25% | 6500.00 | 2008.00 | 0.00 | 1.00 | 2.00 | 66246.00 | 0.00 | 0.00 | 0.00 | 0.00 | 9.00 | 24340.00 | 5.00 | 0.00 | 0.00 |
| 50% | 11380.00 | 2013.00 | 1.00 | 3.00 | 2.00 | 104674.00 | 0.00 | 0.00 | 1.00 | 8.00 | 23.00 | 29735.00 | 8.00 | 0.00 | 1.00 |
| 75% | 18588.00 | 2016.00 | 2.00 | 3.00 | 2.00 | 143296.50 | 0.00 | 0.00 | 1.00 | 9.00 | 37.00 | 37590.00 | 13.00 | 0.00 | 1.00 |
| max | 89950.00 | 2022.00 | 5.00 | 6.00 | 4.00 | 267842.00 | 5.00 | 2.00 | 2.00 | 12.00 | 50.00 | 102100.00 | 116.00 | 1.00 | 1.00 |
# Remove listings with a negative car age.
# The previous form `~train5['car_age']<0` parsed as `(~car_age) < 0`: it
# applied bitwise NOT to the ages before comparing, which only happened to give
# the intended result for integer dtype and fails for float columns.
train5 = train5[train5['car_age'] >= 0]
# Correlation of every other column with price on the `train` frame as loaded
# (before any outlier handling), sorted ascending and drawn as horizontal bars.
train.corr()['price'].drop('price').sort_values().hvplot.barh(
title="Correlation between price and features",
ylabel="Correlation", xlabel="Numeric Features"
)
# The same chart on the cleaned `train5` frame, for comparison with the raw data.
train5.corr()['price'].drop('price').sort_values().hvplot.barh(
title="Correlation between price and features",
ylabel="Correlation", xlabel="Numeric Features"
)